/***************************************************************************************
Firm-embedded productivity and cross-country income differences
Alviarez, Cravino and Ramondo
Journal of Political Economy (2022)

Program: clean_01_orbis.do
Date: October 2022

Description: Clean the raw Orbis data for firm level analysis 

*****************************************************************************************/
clear all
set more off
global typeden=0 
do "set_directories.do"



*=======================================================================
*Ceaning ORBIS firm level data
*=======================================================================
use "${intermediate}/orbis-mult_short_final_all_links_20200109.dta", clear
gen isocode=substr(bvd_id,1,2)
gen hq=substr(guo_50,1,2)
	  

*Eliminate observations without revenue or employment information
*----------------------------------------------------------------
gen data_noavailable=((turn==. | turn==0) & (emp==. | emp==0))
tab data_noavailable
drop if data_noavailable==1
drop data_noavailable


*Eliminate observations with negative revenues 
*----------------------------------------------------------------
gen data_negative=(turn<0)
tab data_negative
drop if data_negative==1
drop data_negative


*Deal with repeated observations 
*----------------------------------------------------------------
drop duplicates 
duplicates tag bvd_id year, gen(dupli)
tab dupli

gen data_rev=(turn!=. & turn!=0)
gen data_emp=(emp!=. & emp!=0)
by bvd_id year, sort: egen data_rev_max=max(data_rev)
by bvd_id year, sort: egen data_emp_max=max(data_emp)

gen gturn=-turn
sort bvd_id year gturn
by bvd_id year, sort: gen a=_n
drop if dupli==1 & data_rev_max==1 & a>1
drop dupli a gturn
duplicates tag bvd_id year, gen(dupli)

gen gemp=-emp
sort bvd_id year gemp
by bvd_id year, sort: gen a=_n
drop if dupli==1 & data_emp_max==1 & a>1
drop dupli a gemp
duplicates tag bvd_id year, gen(dupli)
drop dupli


*Keep MNCs only 
*----------------------------------------------------------------
by year guo_50 isocode, sort: gen a=_n==1
tab a
by year guo_50, sort: egen num_iso=total(a)
drop a
gen MNC=(num_iso>1)
tab MNC
tab MNC if isocode!=hq
drop if MNC==0 & isocode==hq
tab MNC
drop if MNC==0 & isocode!=hq
drop MNC

compress
tempfile temp0
save `temp0', replace 



*Keep and rename key variables  
*----------------------------------------------------------------
use `temp0', clear
keep bvd_id con_code turn emp year guo_50 guo_50c country_code_iso entity industry_code entity_guo50 industry_code_guo50 entity_guo50c industry_code_guo50c exp_rev val_added year_incorp_date isocode hq

rename bvd_id id_bvd
rename guo_50 guo_bvd 
rename guo_50c guo_bvdc

rename entity entity_id
rename entity_guo50 entity_guo
rename entity_guo50c entity_guoc

rename industry_code NAICS
rename industry_code_guo50 NAICS_guo
rename industry_code_guo50c NAICS_guoc

rename turn sales
rename exp_rev exports 
rename val_added va
rename year_incorp_date year_first
gen age=year-year_first
replace age=. if age<0
drop year_first
gen hqc=substr(guo_bvdc,1,2)

label var guo_bvdc "GUO (min 50%) only with type B, C, A and F"
label var guo_bvd "GUO (min 50%)"


*Entity Type: 
*--------------------------------------------------------
*B: Bank 
*F: Finacial Company 
*A: Insurance Company 
*C: Industrial Company
*E: Mutual & Pension Fund/Nominee/Trust/Trustee
*J: Foundation/Research Institute
*S: Public authorities, States, Governments
*I: One or more known individuals or families
*M: Employees/Managers/Directors
*H: Self ownership
*P: Private equity firms
*Z: Public
*D: Unnamed private shareholders, aggregated
*L: Other unnamed shareholders, aggregated
*V: Venture capital
*Y: Hedge fund
*Q: Branch 
*W: Mrine Vessels 


replace hq=hqc if (hq=="WW" | hq=="YY" | hq=="ZZ" | hq=="XX" | hq=="" )

*Use guo (as above), but use the mode hqc (to scape tax haven situations) 
by guo_bvd, sort: egen xxx_mode=mode(hqc), maxmode
gen hq2=xxx_mode
replace hq2=hq if hq2==""
drop hq
rename hq2 hq
drop xxx_mode

*Use guoc and hqc (and guo/hq if not available) 
replace guo_bvdc=guo_bvd if guo_bvdc==""
replace hqc=hq if hqc==""


global cond1 `"  isocode=="AR" | isocode=="AT" | isocode=="AU" | isocode=="BE" | isocode=="BG" | isocode=="BR" | isocode=="CA" |  isocode=="CH" | isocode=="CL" | isocode=="CN" | isocode=="CY" | isocode=="CO" | isocode=="CZ" |  isocode=="DE" |  isocode=="DK" |  isocode=="EE" |  isocode=="ES" |  isocode=="FI" |  isocode=="FR" |  isocode=="GB" |  isocode=="GR" |  isocode=="HK" |  isocode=="HU" |  isocode=="HR" |  isocode=="ID" |  isocode=="IL" |  isocode=="IE" | isocode=="IN" |  isocode=="IS" |  isocode=="IT" |  isocode=="JP" |  isocode=="KR" | isocode=="LT" | isocode=="LV" | isocode=="LU" |  isocode=="MX" |  isocode=="MT" |  isocode=="MY" |  isocode=="NL" |   isocode=="NO" |  isocode=="NZ" |  isocode=="PE" |  isocode=="PL" |  isocode=="PT" |  isocode=="RO" |  isocode=="RS" |   isocode=="RU" | isocode=="SE" |  isocode=="SG" | isocode=="SI" | isocode=="SK" | isocode=="TH" | isocode=="TR" | isocode=="TW" | isocode=="UA" |  isocode=="US" | isocode=="VE" |  isocode=="ZA"   "'
gen a=($cond1)
tab a
keep if a==1
drop a
compress
order year isocode id_bvd NAICS guo_bvd hq guo_bvdc hqc isocode sales emp va exports age


*Sector classification* 
*--------------------------------------------------------------------
tostring NAICS*, replace
replace NAICS_guo="" if NAICS_guo=="."
replace NAICS_guoc="" if NAICS_guoc=="."
replace NAICS="" if NAICS=="."

rename NAICS NAICS4
gen NAICS2=substr(NAICS4,1,2)
gen NAICS3=substr(NAICS4,1,3)

gen sector="NA"
replace sector="NA" if NAICS2=="" | NAICS2=="NA"
replace sector="Agriculture (A)" if NAICS2=="11"
replace sector="Mining (B)" if NAICS2=="21"

replace sector="Food (10-12)" if NAICS3=="311" | NAICS3=="312"
replace sector="Textiles (13-15)" if NAICS3=="313" | NAICS3=="314" | NAICS3=="315" | NAICS3=="316"
replace sector="Wood (16-18)" if NAICS3=="321" | NAICS3=="322" | NAICS3=="323"
replace sector="Petroleum (19)" if NAICS3=="324"
replace sector="Chemicals (20-21)" if NAICS3=="325"
replace sector="Plastic (22-23)" if NAICS3=="326" | NAICS3=="327"
replace sector="Basic Metals (24-25)" if NAICS3=="331" | NAICS3=="332"
replace sector="Electrical_Equip (26-27)" if NAICS3=="334" |  NAICS3=="335" 
replace sector="Machinery (28)" if NAICS3=="333" 
replace sector="Transport (29-30)" if NAICS3=="336"
replace sector="Other_Manufacturing (31-33)" if NAICS3=="337" | NAICS3=="339"

replace sector="Electricity (D-E)" if NAICS2=="22"
replace sector="Construction (F)" if NAICS2=="23"
replace sector="Wholesale_Retail (G)" if NAICS2=="41" | NAICS2=="42" | NAICS2=="44" | NAICS2=="45"
replace sector="Transportation_Storage (H)" if NAICS2=="48" | NAICS2=="49"
replace sector="Accomodation (I)" if NAICS2=="72"
replace sector="Information (J)" if  NAICS2=="51"
replace sector="Financial_Insurance (K)" if  NAICS2=="52"
replace sector="Real_Estate (L)" if  NAICS2=="53"
replace sector="Support_Services (M-N)" if NAICS2=="54" | NAICS2=="55" | NAICS2=="56"
replace sector="Public_Administration (O)" if NAICS2=="92"
replace sector="Education (P)" if  NAICS2=="61"
replace sector="Health (Q)" if  NAICS2=="62"
replace sector="Recreation (R-S)" if NAICS2=="71" | NAICS2=="81"

tab sector 
compress
tempfile temp1
save `temp1', replace


*Manufacturing* 
*------------------------------
clear all
use `temp1', clear
gen sector1="NA"
replace sector1="Manufacturing (C)" if sector=="Food (10-12)"
replace sector1="Manufacturing (C)" if sector=="Textiles (13-15)"
replace sector1="Manufacturing (C)" if sector=="Wood (16-18)"
replace sector1="Manufacturing (C)" if sector=="Petroleum (19)"
replace sector1="Manufacturing (C)" if sector=="Chemicals (20-21)"
replace sector1="Manufacturing (C)" if sector=="Plastic (22-23)"
replace sector1="Manufacturing (C)" if sector=="Basic Metals (24-25)"
replace sector1="Manufacturing (C)" if sector=="Electrical_Equip (26-27)"
replace sector1="Manufacturing (C)" if sector=="Machinery (28)"
replace sector1="Manufacturing (C)" if sector=="Transport (29-30)"
replace sector1="Manufacturing (C)" if sector=="Other_Manufacturing (31-33)"


*Marketing Services
*------------------------------
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Wholesale_Retail (G)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Transportation_Storage (H)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Accomodation (I)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Information (J)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Financial_Insurance (K)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Support_Services (M-N)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Recreation (R-S)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Activity_households (T)"

*Other Goods
*------------------------------
replace sector1="Other_Goods (A-B-D-E-F)" if sector=="Agriculture (A)"
replace sector1="Other_Goods (A-B-D-E-F)" if sector=="Mining (B)"
replace sector1="Other_Goods (A-B-D-E-F)" if sector=="Electricity (D-E)"
replace sector1="Other_Goods (A-B-D-E-F)" if sector=="Construction (F)"


*Non Market Economy 
*------------------------------
replace sector1="Non-Market Economy" if sector!="NA" & sector1=="NA"


*Aggregating some sectors 
*------------------------------
replace sector="TexWood (13-18)" if sector=="Textiles (13-15)" | sector=="Wood (16-18)"
replace sector="ChePetPla (19-23)" if sector=="Chemicals (20-21)" | sector=="Petroleum (19)" | sector=="Plastic (22-23)"
replace sector="ElecMach (26-28)" if sector=="Electrical_Equip (26-27)" | sector=="Machinery (28)"
replace sector="TranspOtherManuf (29-33)" if sector=="Transport (29-30)" | sector=="Other_Manufacturing (31-33)"
replace sector="AgrMining (A-B)" if sector=="Agriculture (A)" | sector=="Mining (B)"
replace sector="Accomod_Recreat (I-R-S)" if sector=="Accomodation (I)" | sector=="Recreation (R-S)"
drop if sector=="NA" & sector1=="NA"

compress
tempfile temp2
save `temp2', replace


use `temp2', clear
foreach vvv in sales va exports {
*From USD to thusand USD 
replace `vvv'=`vvv'/1000
format %9.1fc `vvv' 
}
*From number to thousands of employees 
replace emp=emp/1000
format %9.1fc emp 

label var sales "Operating Revenue (th USD)"
label var emp "Number of employees (th)"
label var exports "Exports (th USD)"
label var va "Value Added (th USD)"

drop if isocode=="CY" | isocode=="ID" | isocode=="IL"| isocode=="MT" | isocode=="MY" | isocode=="VE" | isocode=="ZA"   

sort year isocode id_bvd sector
compress 
save "${output}/firm_orbis_sales_selected_ctry.dta", replace





****************************************************************************************************

clear all
set more off
use "${output}/firm_orbis_sales_selected_ctry.dta", clear
tempfile temp0
save `temp0', replace

foreach vv in sales va exports {
display "`vv'"
replace `vv'=`vv'/1000
replace `vv'=. if `vv'<0 
}

label var sales "Operating Revenue (MM USD)"
label var exports "Exports (MM USD)"
label var va "Value Added (MM USD)"


drop if (sales==. | sales==0) &  (emp==. | emp==0) & (va==. | va==0) &  (exports==. | exports==0)
keep year isocode id_bvd guo_bvd hq NAICS2 NAICS3 NAICS4 sector sector1 sales emp va exports con_code
order year isocode id_bvd guo_bvd hq NAICS2 NAICS3 NAICS4 sector sector1 sales emp va exports con_code

compress
save "${output}/firm_allyears_sales.dta", replace



*----------------------------------------------------------------
*Including data on labor compensation (wagebill)
*----------------------------------------------------------------
/*
C1: account of a company- headquarter of a group, aggregating all companies belonging to the group
(affiliates, subsidiaries, etc.), where the company headquarter has no unconsolidated account, C2: account of
a company-headquarter of a group, aggregating all companies belonging to the group (affiliates, subsidiaries,
etc.) where the company headquarter also presents an unconsolidated account, U1: account of a company
with no consolidated account, and U2: account of a company with a consolidated account.
*/

clear all
import delimited "${tfp_bef}/ORBIS_2018/labcomp_orbis_selected_bvdids.csv"
rename bvdidnumber id_bvd

foreach vv in numberofemployees operatingrevenueturnover sales costofemployees exportrevenue materialcosts addedvalue {
replace `vv'=. if `vv'<=0
}

drop if costofemployees==. & sales==. & operatingrevenueturnover==. & numberofemployees==. & exportrevenue==. & materialcosts==. & addedvalue==. 

*Collapse if more than one information in a given year
collapse (max) numberofemployees operatingrevenueturnover sales costofemployees exportrevenue materialcosts addedvalue, by(id_bvd consolidationcode year)
keep if consolidationcode=="U1" | consolidationcode=="U2"
tab consolidationcode
gen rank=1 if consolidationcode=="U1"
replace rank=2 if consolidationcode=="U2"
replace rank=3 if consolidationcode=="C2"
replace rank=4 if consolidationcode=="C1"
sort id_bvd year rank 
by  id_bvd year, sort: gen a=_n==1
keep if a==1
drop a 
drop rank
rename consolidationcode con_code
tab con_code

rename costofemployees labcost
rename sales sales_new
rename operatingrevenueturnover turnover_new
rename numberofemployees emp_new
rename exportrevenue exp_new
rename materialcosts material_new
rename addedvalue va_new

foreach vv in sales_new turnover_new labcost exp_new material_new va_new {
replace `vv'=`vv'/1000000
}
replace emp_new=emp_new/1000
compress

label variable  sales_new "Operating Revenue (MM USD) - new"
label variable  turnover_new "Operating Turnover (MM USD) - new"
label variable  emp_new "Number of employees (th) - new"
label variable  labcost "Labor Compensation (MM USD) - new"
label variable  exp_new "Exports (MM USD) - new"
label variable  material_new "Material Cost (MM USD) - new"
label variable  va_new "Value Added (MM USD) - new"
tempfile temp
save `temp', replace


use "${output}/firm_allyears_sales.dta", clear
*merge m:1 id_bvd using "F:\Dropbox (UBC-Umich)\Master_Dataset\Temp/BVDID_index.dta"
merge m:1 id_bvd year using `temp', keepusing(sales_new turnover_new  emp_new labcost exp_new material_new va_new)
drop if _merge==2
drop _merge
compress

gen a=0
replace a=1 if emp==. & emp_new!=.
tab a
drop a 
gen a=0
replace a=1 if va==. & va_new!=.
tab a
drop a
replace  va=va_new if va==. & va_new!=.

gen a=0
replace a=1 if sales!=. & emp!=. & va!=. & labcost!=.
tab a
drop a 

save "${output}/firm_allyears_sales.dta", replace
